Arti Patel (ap8qk@virginia.edu)
Sudeepti Surapaneni(ss9ud@virginia.edu)
Adonis Lu (ayl3yq@virginia.edu)
DS 5001
29 April 2020
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline
from glob import glob
import re
import nltk
import plotly_express as px
# CONFIG
# OHCO = Ordered Hierarchy of Content Objects: the index levels used
# throughout to address any token in the corpus.
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
epub_dir = 'epubs_'  # directory holding the raw Project Gutenberg .txt files
Since Project Gutenberg texts vary widely in their markup, we define our chunking patterns by hand.
# Regex fragments for chapter headings.
# NOTE(review): the roman class omits 'D' (500) — harmless for these books,
# but confirm. Patterns are non-raw strings, so '\s' / '\d' trigger
# invalid-escape warnings on newer Pythons (behavior is unchanged).
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
# Per-book chunking config keyed by Project Gutenberg book id:
# start_line/end_line bracket the body text (trimming PG boilerplate),
# 'chapter' matches that book's chapter-heading lines.
chap_pats = {
    2787: {
        'start_line': 21,
        'end_line': 11235,
        #'chapter': re.compile("^\s*CHAPTER\s+{}\.\s*$".format(roman))
        'chapter': re.compile("^\s*CHAPTER\s+{}\. .*$".format(roman))
    },
    2726: {
        'start_line': 21,
        'end_line': 11235,
        #'chapter': re.compile("^\s*CHAPTER\s+{}\.\s*$".format(roman))
        'chapter': re.compile("^Chapter\s+\d+.+$")
    },
    2786: {
        'start_line': 21,
        'end_line': 11235,
        'chapter': re.compile("^\s*Chapter\s+{}\. .*$".format(roman))
    },
    514: {
        'start_line': 21,
        'end_line': 11235,
        # NOTE(review): 'CHAPTER+' repeats the final 'R'; it still matches
        # 'CHAPTER' (one 'R'), but plain 'CHAPTER' was probably intended.
        'chapter': re.compile('^CHAPTER+\s*{}\s*$'.format(caps))
    },
    3499: {
        'start_line': 21,
        'end_line': 10256,
        'chapter': re.compile("^Chapter\s+\d+.+$")
    },
    2788: {
        'start_line': 21,
        'end_line': 11620,
        #'chapter': re.compile('^CHAPTER+\s*{}\s*$'.format(roman))
        'chapter': re.compile("^\s*CHAPTER\s+{}\. .*$".format(roman))
    },
    2804: {
        'start_line': 21,
        'end_line': 11235,
        'chapter': re.compile("^Chapter\s+\d+.+$")
    },
    3795: {
        'start_line': 21,
        'end_line': 11235,
        'chapter': re.compile('^CHAPTER+\s*{}\s*$'.format(roman))
    }
}
def acquire_epubs(epub_list, chap_pats, OHCO=OHCO):
my_lib = []
my_doc = []
for epub_file in epub_list:
# Get PG ID from filename
book_id = int(epub_file.split('-')[-1].split('.')[0].replace('pg',''))
print("BOOK ID", book_id)
# Import file as lines
lines = open(epub_file, 'r', encoding='utf-8-sig').readlines()
df = pd.DataFrame(lines, columns=['line_str'])
df.index.name = 'line_num'
df.line_str = df.line_str.str.strip()
df['book_id'] = book_id
# FIX CHARACTERS TO IMPROVE TOKENIZATION
df.line_str = df.line_str.str.replace('—', ' — ')
df.line_str = df.line_str.str.replace('-', ' - ')
# Get book title and put into LIB table -- note problems, though
book_title = re.sub(r"The Project Gutenberg eBook( of|,) ", "", df.loc[0].line_str, flags=re.IGNORECASE)
#book_title = re.sub(r"The Project Gutenberg eBook( of|,) ", "", df.loc[1].line_str, flags=re.IGNORECASE)
book_title = re.sub(r"Project Gutenberg's ", "", book_title, flags=re.IGNORECASE)
# Remove cruft
a = chap_pats[book_id]['start_line'] - 1
b = chap_pats[book_id]['end_line'] + 1
df = df.iloc[a:b]
# Chunk by chapter
chap_lines = df.line_str.str.match(chap_pats[book_id]['chapter'])
chap_nums = [i+1 for i in range(df.loc[chap_lines].shape[0])]
df.loc[chap_lines, 'chap_num'] = chap_nums
df.chap_num = df.chap_num.ffill()
# Clean up
df = df[~df.chap_num.isna()] # Remove chapter heading lines
df = df.loc[~chap_lines] # Remove everything before Chapter 1
df['chap_num'] = df['chap_num'].astype('int')
# Group -- Note that we exclude the book level in the OHCO at this point
df = df.groupby(OHCO[1:2]).line_str.apply(lambda x: '\n'.join(x)).to_frame() # Make big string
# Split into paragrpahs
df = df['line_str'].str.split(r'\n\n+', expand=True).stack().to_frame().rename(columns={0:'para_str'})
df.index.names = OHCO[1:3] # MAY NOT BE NECESSARY UNTIL THE END
df['para_str'] = df['para_str'].str.replace(r'\n', ' ').str.strip()
df = df[~df['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs
# Set index
df['book_id'] = book_id
df = df.reset_index().set_index(OHCO[:3])
# Register
my_lib.append((book_id, book_title, epub_file))
my_doc.append(df)
docs = pd.concat(my_doc)
library = pd.DataFrame(my_lib, columns=['book_id', 'book_title', 'book_file']).set_index('book_id')
print("Done.")
return library, docs
# Build the corpus: parse the epub files, then merge in the separately
# prepared Poe tables (LIB_arti / DOC_arti).
epubs = sorted(glob(epub_dir + '/*.txt'))
LIB, DOC = acquire_epubs(epubs, chap_pats)
LIB_arti = pd.read_csv('LIB_arti.csv')
DOC_arti = pd.read_csv('DOC_arti.csv')
# The Poe corpus is chunked by story; align the level name with 'chap_num'
DOC_arti = DOC_arti.rename(columns={'story_num': 'chap_num'})
DOC_arti = DOC_arti.set_index(['book_id', 'chap_num', 'para_num'])
LIB['book_id'] = LIB.index
# DataFrame.append() was removed in pandas 2.0; pd.concat is the
# supported (and equivalent) way to stack the two tables.
LIB = pd.concat([LIB, LIB_arti])
DOC = pd.concat([DOC, DOC_arti])
LIB.index = LIB['book_id']
LIB
DOC.sample(10)
We use NLTK this time. Note that this process takes some time, mainly because the NLTK functions are not optimized for dataframes.
Note that we can choose between tokenizers. NLTK offers a variety of them. Here is a list.
def tokenize(doc_df, OHCO=None, remove_pos_tuple=False, ws=False):
    """Explode a paragraph-level corpus into a POS-tagged token table.

    Parameters
    ----------
    doc_df : pd.DataFrame
        Corpus indexed by (book, chapter, paragraph) with a 'para_str'
        column of paragraph text.
    OHCO : list of str, optional
        Five index-level names for the result; defaults to the standard
        book/chap/para/sent/token hierarchy.
    remove_pos_tuple : bool
        If True, drop the intermediate (token, pos) tuple column.
    ws : bool
        If True tokenize on whitespace; otherwise use NLTK's default
        word tokenizer (which discards the text between tokens).

    Returns
    -------
    pd.DataFrame indexed by OHCO with 'pos_tuple', 'pos', 'token_str'.
    """
    if OHCO is None:
        OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
    # Paragraphs to Sentences
    df = doc_df.para_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame()\
        .rename(columns={0: 'sent_str'})

    # Sentences to Tokens
    # Local function to pick tokenizer
    def word_tokenize(x):
        if ws:
            s = pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)))
        else:
            s = pd.Series(nltk.pos_tag(nltk.word_tokenize(x)))  # Discards stuff in between
        return s

    df = df.sent_str\
        .apply(word_tokenize)\
        .stack()\
        .to_frame()\
        .rename(columns={0: 'pos_tuple'})

    # Grab info from tuple
    df['pos'] = df.pos_tuple.apply(lambda x: x[1])
    df['token_str'] = df.pos_tuple.apply(lambda x: x[0])
    if remove_pos_tuple:
        # drop('pos_tuple', 1): positional 'axis' was removed in pandas 2.0
        df = df.drop(columns='pos_tuple')

    # Add index level names
    df.index.names = OHCO
    return df
%%time
TOKEN = tokenize(DOC, ws=False)
TOKEN.head()
TOKEN[TOKEN.pos.str.match('^NNP')]
Extract a vocabulary from the TOKEN table
TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace('[\W_]', '')
VOCAB = TOKEN.term_str.value_counts().to_frame()\
.rename(columns={'index':'term_str', 'term_str':'n'})\
.sort_index().reset_index().rename(columns={'index':'term_str'})
VOCAB.index.name = 'term_id'
VOCAB['num'] = VOCAB.term_str.str.match("\d+").astype('int')
VOCAB
We use NLTK's built in stopword list for English. Note that we can add and subtract from this list, or just create our own list and keep it in our data model.
# Build a stopword lookup keyed by term string (value is a dummy 1 flag)
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1
sw.sample(10)
# Mark stopwords in VOCAB; map() yields NaN for non-stopwords, hence fillna(0)
VOCAB['stop'] = VOCAB.term_str.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')
VOCAB[VOCAB.stop == 1].sample(10)
# Add Porter stems to the vocabulary
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.term_str.apply(stemmer1.stem)
VOCAB.sample(10)
VOCAB.index.name = 'term_id'
# Most frequent POS tag per term (idxmax takes the first label on ties)
MFCB = TOKEN.groupby(['term_str','pos']).pos.count().unstack().idxmax(1).to_frame()
# NOTE(review): merging on term_str rebuilds the row order; term_id is
# re-declared as the index name just below.
VOCAB = pd.merge(VOCAB,MFCB, left_on='term_str', right_on = 'term_str')
VOCAB = VOCAB.rename(columns = {0:'pos_max'})
VOCAB.sample(5)
VOCAB.index.name = 'term_id'
# Persist the core tables for the following sections
DOC.to_csv('DOC.csv')
LIB.to_csv('LIBRARY.csv')
VOCAB.to_csv('VOCAB.csv')
TOKEN.to_csv('TOKEN.csv')
# CONFIG for the TFIDF section
count_method = 'n' # 'c' or 'n' # n = n tokens, c = distinct token (term) count
tf_method = 'sum' # sum, max, log, double_norm, raw, binary
tf_norm_k = .5 # only used for double_norm
idf_method = 'standard' # standard, max, smooth
gradient_cmap = 'YlGnBu' # YlGn, GnBu, YlGnBu; For tables; see https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html
# Bag definitions derived from the OHCO; 'bag' sets the document unit
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
SENTS = OHCO[:4]
PARAS = OHCO[:3]
CHAPS = OHCO[:2]
BOOKS = OHCO[:1]
bag = CHAPS
pd.__version__
sns.set()
%matplotlib inline
Bring in the tables we created last time.
%%time
# Reload the tables saved in the previous section
LIB = pd.read_csv("LIBRARY.csv").set_index(BOOKS)
TOKEN = pd.read_csv('TOKEN.csv').set_index(OHCO)
VOCAB = pd.read_csv('VOCAB.csv').set_index('term_id')
# DOC = pd.read_csv(data_dir + "DOC.csv").set_index(PARAS)
LIB = LIB[['book_title', 'book_file']]
LIB
VOCAB.head()
# Drop terms whose normalized form was empty (read back from CSV as NaN)
VOCAB = VOCAB[~VOCAB.term_str.isna()]
VOCAB.sample(5)
TOKEN.head()
TOKEN = TOKEN[~TOKEN.term_str.isna()]
TOKEN.head()
# DOC.head()
We need to do this to combine the VOCAB and TOKEN tables more efficiently. Note, we could have done this in the previous lab.
We use .map() because TOKEN and VOCAB do not share an index at this time.
# Attach each token's term_id via a term_str -> term_id lookup
TOKEN['term_id'] = TOKEN.term_str.map(VOCAB.reset_index().set_index('term_str').term_id)
TOKEN.head()
Just in case it's not there. It's easy now that we have a share feature -- term_id -- between VOCAB and TOKEN.
Regarding collisions when using .idxmax(), the documentation says "If multiple values equal the maximum, the first row label with that value is returned."
# Demo
# TOKEN.groupby(['term_id', 'pos']).pos.count()
# TOKEN.groupby(['term_id', 'pos']).pos.count().unstack()
# TOKEN.groupby(['term_id', 'pos']).pos.count().unstack().idxmax(1)
# Recompute the modal POS tag per term, now keyed by term_id
VOCAB['pos_max'] = TOKEN.groupby(['term_id', 'pos']).pos.count().unstack().idxmax(1)
VOCAB.sample(5)
Pause and look at distribution of POS tags. The POS table could become part of your data model (analytical edition) if you were interested in studying POS tags.
# POS tag frequency table and bar chart
POS = TOKEN.pos.value_counts().to_frame().rename(columns={'pos':'n'})
POS.index.name = 'pos_id'
POS.sort_values('n').plot.bar(y='n', figsize=(15,5), rot=45);
$f \propto \frac{1}{r} $
$k = fr$
# Assign a 1-based rank by descending frequency (guard keeps the cell idempotent)
if 'term_rank' not in VOCAB.columns:
    VOCAB = VOCAB.sort_values('n', ascending=False).reset_index()
    VOCAB.index.name = 'term_rank'
    VOCAB = VOCAB.reset_index()
    VOCAB = VOCAB.set_index('term_id')
    VOCAB['term_rank'] = VOCAB['term_rank'] + 1  # 0-based -> 1-based
VOCAB.head()
The term_rank as defined above assigns different ranks to words with the same frequency, which occurs in the long tail, e.g. with words that appear once.
This measure groups words by term count.
# Alternative rank: all terms with the same frequency share one rank
new_rank = VOCAB.n.value_counts()\
    .sort_index(ascending=False).reset_index().reset_index()\
    .rename(columns={'level_0':'term_rank2', 'index':'n', 'n':'nn'})\
    .set_index('n')
new_rank.head()
VOCAB['term_rank2'] = VOCAB.n.map(new_rank.term_rank2) + 1
VOCAB.head()
# Zipf's law diagnostics: k = f * r should be roughly constant.
# NOTE(review): 'p' divides by vocabulary size, not total token count;
# 'p2' below (n / n.sum()) is the usual relative frequency — confirm intent.
VOCAB['p'] = VOCAB.n / VOCAB.shape[0]
VOCAB['zipf_k'] = VOCAB.n * VOCAB.term_rank
VOCAB['zipf_k2'] = VOCAB.n * VOCAB.term_rank2
VOCAB['zipf_k3'] = VOCAB.p * VOCAB.term_rank2
VOCAB.describe().T
# Inspect the extremes of the zipf_k distribution
VOCAB[VOCAB.zipf_k <= VOCAB.zipf_k.quantile(.1)].sort_values('zipf_k3', ascending=True).head()
VOCAB[VOCAB.zipf_k >= VOCAB.zipf_k.quantile(.9)].sort_values('zipf_k3', ascending=False).head()
# px.histogram(VOCAB, 'zipf_k', marginal='box')
# px.histogram(VOCAB, 'zipf_k2', marginal='box')
# px.histogram(VOCAB, 'zipf_k3', marginal='box')
VSAMP1 = VOCAB[['n','term_rank','zipf_k','term_str','pos_max']]
VSAMP2 = VOCAB[['n','term_rank2','zipf_k3']].drop_duplicates()
# px.scatter(VSAMP1, x='term_rank', y='n', log_y=False, log_x=False, hover_name='term_str', color='pos_max')
# px.scatter(VSAMP2, x='term_rank2', y='n', log_y=False, log_x=False)
# px.scatter(VSAMP1, x='term_rank', y='n', log_y=True, log_x=True, hover_name='term_str', color='pos_max')
# px.scatter(VSAMP2, x='term_rank2', y='n', log_y=True, log_x=True)
# Spot-check terms at selected ranks across the frequency spectrum
rank_index = [1, 2, 3, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]
demo = VOCAB.loc[VOCAB.term_rank.isin(rank_index), ['term_str', 'term_rank', 'n', 'zipf_k', 'pos_max']]
demo.style.background_gradient(cmap=gradient_cmap, high=.5)
This is the prior, or marginal, probability of a term.
%%time
# Corpus entropy H and redundancy R relative to the uniform maximum H_max
VOCAB['p2'] = VOCAB.n / VOCAB.n.sum()
VOCAB['h'] = VOCAB.p2 * np.log2(1/VOCAB.p2) # Self entropy of each word
H = VOCAB.h.sum()
N_v = VOCAB.shape[0]
H_max = np.log2(N_v)
R = round(1 - (H/H_max), 2) * 100
print("H \t= {}\nH_max \t= {}\nR \t= {}%".format(H, H_max, int(R)))
# Bag-of-words: token count n and binary presence c per (bag, term)
BOW = TOKEN.groupby(bag+['term_id']).term_id.count()\
    .to_frame().rename(columns={'term_id':'n'})
BOW['c'] = BOW.n.astype('bool').astype('int')
BOW.head(10)
BOW.to_csv('BOW.csv')
We create a document-term count matrix. Note that we can create a matrix for any of the features in BOW. Also, see how the OHCO helps us distinguish between features and observation identity.
Note, these operations are slower than using groupby().
%%time
# Pivot BOW into a docs x terms count matrix (missing pairs -> 0)
DTCM = BOW[count_method].unstack().fillna(0).astype('int')
DTCM.head()
We could also compute that using BOW.groupby().
%%time
# Term frequency under the scheme selected by tf_method.
# DTCM is docs x terms; we transpose so column-wise ops act per document,
# then transpose back at the end.
print('TF method:', tf_method)
if tf_method == 'sum':
    TF = DTCM.T / DTCM.T.sum()
elif tf_method == 'max':
    TF = DTCM.T / DTCM.T.max()
elif tf_method == 'log':
    TF = np.log10(1 + DTCM.T)
elif tf_method == 'raw':
    TF = DTCM.T
elif tf_method == 'double_norm':
    TF = DTCM.T / DTCM.T.max()
    # NOTE(review): TF[TF > 0] leaves NaN where the count is zero
    TF = tf_norm_k + (1 - tf_norm_k) * TF[TF > 0] # EXPLAIN; may defeat purpose of norming
elif tf_method == 'binary':
    TF = DTCM.T.astype('bool').astype('int')
TF = TF.T
TF.head()
%%time
# Document frequency: number of documents containing each term
DF = DTCM[DTCM > 0].count()
DF.head()
N = DTCM.shape[0]
# Inverse document frequency under the selected idf_method
print('IDF method:', idf_method)
if idf_method == 'standard':
    IDF = np.log10(N / DF)
elif idf_method == 'max':
    IDF = np.log10(DF.max() / DF)
elif idf_method == 'smooth':
    IDF = np.log10((1 + N) / (1 + DF)) + 1 # Correct?
TFIDF = TF * IDF
TFIDF.head()
VOCAB['df'] = DF
VOCAB['idf'] = IDF
VOCAB.head()
%%time
# Push tf and tfidf back onto the long-format BOW table
BOW['tf'] = TF.stack()
BOW['tfidf'] = TFIDF.stack()
BOW.head()
# Aggregate tfidf over documents as a simple term-importance score
VOCAB['tfidf_sum'] = TFIDF.sum()
VOCAB.sort_values('tfidf_sum', ascending=False).head(20).style.background_gradient(cmap=gradient_cmap, high=1)
VOCAB[['term_rank','term_str','pos_max','tfidf_sum']]\
    .sort_values('tfidf_sum', ascending=False).head(50)\
    .style.background_gradient(cmap=gradient_cmap, high=1)
# Same view with proper nouns excluded
VOCAB.loc[VOCAB.pos_max != 'NNP', ['term_rank','term_str','pos_max','tfidf_sum']]\
    .sort_values('tfidf_sum', ascending=False)\
    .head(50).style.background_gradient(cmap=gradient_cmap, high=1)
BOW = BOW.join(VOCAB[['term_str','pos_max']], on='term_id')
BOW.sort_values('tfidf', ascending=False).head(20)\
    .style.background_gradient(cmap=gradient_cmap, high=1)
px.scatter(VOCAB, x='term_rank', y='tfidf_sum', hover_name='term_str', hover_data=['n'], color='pos_max')
px.scatter(VOCAB, x='term_rank2', y='tfidf_sum', hover_name='term_str', hover_data=['n'], color='pos_max')
# px.scatter(VOCAB, x='term_rank', y='tfidf_sum', hover_name='term_str', hover_data=['n'], color='pos_max', log_x=True, log_y=True)
# px.scatter(VOCAB, x='term_rank2', y='tfidf_sum', hover_name='term_str', hover_data=['n'], color='pos_max',
#            log_x=True, log_y=True)
demo2 = VOCAB.loc[VOCAB.term_rank.isin(rank_index), ['term_str', 'pos_max', 'term_rank', 'n', 'zipf_k', 'tfidf_sum']]
demo2.style.background_gradient(cmap=gradient_cmap, high=1)
px.scatter(demo2, x='term_rank', y='tfidf_sum', log_x=True, log_y=True, text='term_str', color='pos_max', size='n')
# Word-context entropy: each term's distribution over documents
WCM = DTCM / DTCM.sum()
WCM.sum().head()
WCMh = WCM * np.log2(1/WCM)
VOCAB['h2'] = WCMh.sum()
VOCAB['h2'].hist();
# Experimental salience measure combining rank and entropy
VOCAB['x_factor'] = np.log(VOCAB.term_rank) * VOCAB.h2
px.scatter(VOCAB, x='term_rank', y='x_factor', hover_name='term_str', color='pos_max', hover_data=['n'])
VOCAB['x_factor2'] = np.log(VOCAB.term_rank2) * VOCAB.h2
px.scatter(VOCAB, x='term_rank2', y='x_factor2', hover_name='term_str', color='pos_max', hover_data=['n'])
# px.scatter(VOCAB, x='term_rank', y='x_factor', log_x=True, log_y=True, hover_name='term_str', color='pos_max', hover_data=['n'])
# px.scatter(VOCAB, x='term_rank2', y='x_factor2', log_x=True, log_y=True, hover_name='term_str', color='pos_max', hover_data=['n'])
demo3 = VOCAB.loc[VOCAB.term_rank.isin(rank_index), ['term_str', 'pos_max', 'n', 'term_rank', 'zipf_k', 'tfidf_sum', 'h2', 'x_factor', 'term_rank2', 'x_factor2']]
demo3.style.background_gradient(cmap=gradient_cmap)
# px.scatter(demo3, x='term_rank', y='x_factor', log_x=True, log_y=True, text='term_str', color='pos_max', size='n')
px.scatter(demo3, x='term_rank2', y='x_factor2', log_x=False, log_y=False, text='term_str', color='pos_max', size='n')
We want to take the upper and middle segment of our graph.
# Significant terms: high salience but outside the top of the rank list
# key_col = 'tfidf_sum'
key_col = 'x_factor2'
key_min = VOCAB[key_col].quantile(.9)  # top decile of the chosen key
rank_min = 200                          # skip the 200 most frequent terms
SIGS = VOCAB.loc[(VOCAB[key_col] >= key_min) & (VOCAB.term_rank >= rank_min)].sort_values(key_col, ascending=False)
SIGS.shape[0]
SIGS[['pos_max', 'term_str', 'n', 'term_rank', 'zipf_k', 'df', 'idf', 'tfidf_sum','x_factor2']].head(100).style.background_gradient(cmap=gradient_cmap, high=1)
# Persist the enriched tables for downstream sections
# NOTE(review): BOW is saved under the name DOC2.csv — confirm intended.
VOCAB.to_csv('VOCAB2.csv')
TOKEN.to_csv('TOKEN2.csv')
BOW.to_csv('DOC2.csv')
DTCM.to_csv('DTCM.csv')
TFIDF.to_csv('TFIDF.csv')
#SIGS.to_csv('SIGS.csv')
#WCM.to_csv('WCM.csv')
# BOW.to_csv('BOW.csv')
We define two OHCO lists, one to match the reduced TFIDF table we are importing, and the other to define the table after we compress this table to make clustering easier.
# OHCO_src matches the TFIDF table on disk; OHCO is the coarser level
# (books) we compress to for clustering.
OHCO_src = ['book_id', 'chap_num']
OHCO = ['book_id']
import pandas as pd
import numpy as np
import re
from numpy.linalg import norm
from scipy.spatial.distance import pdist
import seaborn as sns
sns.set(style="ticks")
%matplotlib inline
TFIDF = pd.read_csv('TFIDF.csv').set_index(OHCO_src)
VOCAB = pd.read_csv('VOCAB.csv').set_index('term_id')
LIB = pd.read_csv('LIBRARY.csv').set_index('book_id')
TFIDF.head()
# Derive author/title metadata from the PG title strings:
# anything not matching 'Alcott' is attributed to Poe.
LIB.loc[LIB.book_title.str.contains('Alcott'), 'author'] = 'Alcott'
LIB['author'] = LIB['author'].fillna('Poe')
LIB['title'] = LIB.book_title.str.split(', by').apply(lambda x: x[0])
# NOTE(review): renames 5 columns positionally — depends on the exact
# column layout of LIBRARY.csv; verify against the saved file.
LIB.columns = ['book_id','book_title','book_file','author','title']
LIB
We want to work with larger bags in this notebook, in order to better visualize our resulting clusters.
# Compress chapter-level TFIDF vectors to one mean vector per book
TFIDF = TFIDF.groupby(OHCO).mean()
TFIDF
We want to create a new table that maps the OHCO levels to a single doc_id. We do this so that when we create a table to store pairs of docs and their distances, we can use a single-valued ID for each docs.
This table will also be used to store cluster assignments.
All of this will become clearer below!
DOC = TFIDF.reset_index()[OHCO] # We create a table from the OHCO in our TFIDF table
DOC.index.name = 'doc_id' # We give the new index a name
DOC.head()
# Human-readable label: author - ohco ids - title
DOC['title'] = DOC.book_id.map(LIB.author) \
    + '-' + DOC[OHCO].apply(lambda x: x.astype('str').str.cat(sep='-'), 1) \
    + ': '+ DOC.book_id.map(LIB.title)
DOC
# Normalized views of the TFIDF matrix: binary (L0), L1 and L2 row norms
L0 = TFIDF.astype('bool').astype('int')
L1 = TFIDF.apply(lambda x: x / x.sum(), 1)
L2 = TFIDF.apply(lambda x: x / norm(x), 1)
Create a table to store our results.
Note that pdist() is a "distance matrix computation from a collection of raw observation vectors stored in a rectangular array".
# All unordered doc pairs (upper triangle), matching pdist's output order
PAIRS = pd.DataFrame(index=pd.MultiIndex.from_product([DOC.index.tolist(), DOC.index.tolist()])).reset_index()
PAIRS = PAIRS[PAIRS.level_0 < PAIRS.level_1].set_index(['level_0','level_1'])
PAIRS.index.names = ['doc_a', 'doc_b']
PAIRS.shape
PAIRS.head()
# One column per distance metric
%time PAIRS['cityblock'] = pdist(TFIDF, 'cityblock')
%time PAIRS['euclidean'] = pdist(TFIDF, 'euclidean')
%time PAIRS['cosine'] = pdist(TFIDF, 'cosine')
%time PAIRS['jaccard'] = pdist(L0, 'jaccard') # Fast, and similar to js
%time PAIRS['dice'] = pdist(L0, 'dice')
# %time PAIRS['js'] = pdist(L1, 'jensenshannon') # Turns out to be really slow
%time PAIRS['euclidean2'] = pdist(L2, 'euclidean') # Should be the same as cosine (colinear)
%time PAIRS['js'] = pdist(TFIDF, 'jensenshannon')
import scipy
scipy.__version__
PAIRS.head()
# Pairwise scatter of the metrics (sampled to keep the plot fast)
if PAIRS.shape[0] > 1000:
    SAMPLE = PAIRS.sample(1000)
else:
    SAMPLE = PAIRS
sns.pairplot(SAMPLE)
PAIRS.sort_values('cosine').head(20).style.background_gradient('YlGn', high=1)
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
def hca(sims, linkage_method='ward', color_thresh=.3, figsize=(10, 10)):
    """Plot a hierarchical-clustering dendrogram from condensed distances.

    Parameters
    ----------
    sims : condensed pairwise distance vector (as produced by scipy pdist).
    linkage_method : linkage criterion passed to scipy.
    color_thresh : distance below which branches get distinct colors.
    figsize : figure size in inches.

    Relies on the global DOC table for leaf labels.
    """
    tree = sch.linkage(sims, method=linkage_method)
    labels = DOC.title.values
    # subplots() creates the figure itself; the previous extra plt.figure()
    # call opened a second, empty figure before each dendrogram.
    fig, axes = plt.subplots(figsize=figsize)
    sch.dendrogram(tree,
                   labels=labels,
                   orientation="left",
                   count_sort=True,
                   distance_sort=True,
                   above_threshold_color='.75',
                   color_threshold=color_thresh)
    plt.tick_params(axis='both', which='major', labelsize=14)
# Dendrograms under various metrics (thresholds tuned to each metric's scale)
hca(PAIRS.cosine, color_thresh=1)
hca(PAIRS.jaccard, color_thresh=.6)
hca(PAIRS.euclidean, color_thresh=.3)
hca(PAIRS.cityblock, color_thresh=8)
hca(PAIRS.js, color_thresh=.6)
K-Means only uses Euclidean distance. Why?
See the Cross Validated post on this.
from sklearn.cluster import KMeans
n_clusters = 4
# Cluster assignments under each normalization of the TFIDF matrix
DOC['y_raw'] = KMeans(n_clusters).fit_predict(TFIDF)
DOC['y_L0'] = KMeans(n_clusters).fit_predict(L0)
DOC['y_L1'] = KMeans(n_clusters).fit_predict(L1)
DOC['y_L2'] = KMeans(n_clusters).fit_predict(L2)
DOC.sort_values('y_raw').style.background_gradient(cmap='YlGn', high=1)
# Persist results for the PCA section
DOC.to_csv('DOC3.csv')
LIB.to_csv('LIB3.csv')
TFIDF.to_csv('TFIDF_book_final.csv')
# CONFIG for the PCA section: work at chapter level
OHCO = ['book_id', 'chap_num']
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from scipy.linalg import norm
import plotly_express as px
import seaborn as sns
sns.set(style='ticks')
%matplotlib inline
# Reload the saved tables
LIB = pd.read_csv('LIB3.csv').set_index('book_id')
VOCAB = pd.read_csv('VOCAB2.csv').set_index('term_id')
TFIDF = pd.read_csv('TFIDF.csv').set_index(OHCO)
TFIDF.shape
We use L2 normalization
# Unit-length (L2) normalize each document row
TFIDF = TFIDF.apply(lambda x: x / np.sqrt(np.square(x).sum()), 1)
We do not normalize variance, which we would normally do, such as with data containing divergent units of measure. \ This is because to do so would exaggerate the importance of rare words (see Ng, 2008: 6m40s — 8m00s).
Note that we are taking the column-wise means -- the means for the term vectors. \ We don't really need to do this. But it is typical for PCA. \ NOTE: Some argue that centering alters the cosine angles.
# Mean-center each term column
TFIDF = TFIDF - TFIDF.mean()
$n = |X| = |Y|$
$Cov(X,Y) = \dfrac{\sum_{i=1}^{n} (x_i - \mu_X) (y_i - \mu_Y)}{n - 1} = \dfrac{XY}{n-1}$
We could use the built-in Pandas method here, but we compute it ourselves.
# COV = TFIDF.cov() # This also centers the vectors
# Covariance of term vectors, computed directly from the centered matrix
COV = TFIDF.T.dot(TFIDF) / (TFIDF.shape[0] - 1)
COV.head()
COV.iloc[:5,:10].style.background_gradient()
There are at least three options to choose from. We go with SciPy's Hermitian eigendecomposition \
method eigh(), since our covariance matrix is symmetric.
from scipy.linalg import eigh
# Eigendecomposition of the symmetric covariance matrix
# (eigh returns eigenvalues in ascending order)
%time eig_vals, eig_vecs = eigh(COV)
TERM_IDX = COV.index
EIG_VEC = pd.DataFrame(eig_vecs, index=TERM_IDX, columns=TERM_IDX)
EIG_VAL = pd.DataFrame(eig_vals, index=TERM_IDX, columns=['eig_val'])
EIG_VAL.index.name = 'term_id'
EIG_VEC.iloc[:5, :10].style.background_gradient()
EIG_VAL.iloc[:5]
Next, we associate each eigenvalue with its corresponding column in the eigenvalue matrix. \
This is why we transpose the EIG_VEC dataframe.
# Pair each eigenvalue with its eigenvector (rows of the transposed matrix)
EIG_PAIRS = EIG_VAL.join(EIG_VEC.T)
EIG_PAIRS.head()
Next, we sort in descending order and pick the top K (=10).
We might have used this value to sort our components.
# Percent of variance explained by each component
EIG_PAIRS['exp_var'] = np.round((EIG_PAIRS.eig_val / EIG_PAIRS.eig_val.sum()) * 100, 2)
EIG_PAIRS.exp_var.sort_values(ascending=False).head().plot.bar(rot=45)
We pick these based on explained variance.
# Top 10 components by explained variance, relabeled PC0..PC9
COMPS = EIG_PAIRS.sort_values('exp_var', ascending=False).head(10).reset_index(drop=True)
COMPS.index.name = 'comp_id'
COMPS.index = ["PC{}".format(i) for i in COMPS.index.tolist()]
COMPS
# The term_str associated with each of the top-10 eigenvalue rows
VOCAB.loc[[int(x) for x in EIG_PAIRS.sort_values('exp_var', ascending=False).head(10).index], 'term_str']
Loadings show the contribution of each term to the component. \ We'll just look at the top 10 words for the first two components in the Book version.
# Loadings: term rows x component columns
LOADINGS = COMPS[TERM_IDX].T
LOADINGS.index.name = 'term_id'
LOADINGS.head().style.background_gradient()
# Attach term strings (x.name is the term_id)
LOADINGS['term_str'] = LOADINGS.apply(lambda x: VOCAB.loc[int(x.name)].term_str, 1)
# l0_pos = LOADINGS.sort_values('PC0', ascending=True).head(10).term_str.str.cat(sep=' ')
# l0_neg = LOADINGS.sort_values('PC0', ascending=False).head(10).term_str.str.cat(sep=' ')
# l1_pos = LOADINGS.sort_values('PC1', ascending=True).head(10).term_str.str.cat(sep=' ')
# l1_neg = LOADINGS.sort_values('PC1', ascending=False).head(10).term_str.str.cat(sep=' ')
# print('Books PC0+', l0_pos)
# print('Books PC0-', l0_neg)
# print('Books PC1+', l1_pos)
# print('Books PC1-', l1_neg)
We get the dot product of the DTM matrix and the new component matrix, which we will call DCM -- for document-component matrix. \ This has the effect of replacing the features of the DTM with the features of the transposed component matrix.
# Project documents into component space: DCM = TFIDF . components^T
DCM = TFIDF.dot(COMPS[TERM_IDX].T)
DCM
We add metadata to our new, reduced matrices for display purposes.
DCM = DCM.join(LIB[['author','title']], on='book_id')
# x.name is the (book_id, chap_num) index tuple; [1] is the chapter number
DCM['doc'] = DCM.apply(lambda x: "{}-{}-{}".format(x.author, x.title, x.name[1]), 1)
DCM.head().style.background_gradient()
def vis_pcs(M, a, b, label='author', prefix='PC'):
    """Scatter-plot two principal components of M against each other,
    colored by the given metadata column, with a box-plot margin on x."""
    x_col = prefix + str(a)
    y_col = prefix + str(b)
    px.scatter(M, x_col, y_col,
               color=label,
               hover_name='doc',
               marginal_x='box').show()
# Plot successive component pairs, colored by author and by title
vis_pcs(DCM, 0, 1)
vis_pcs(DCM, 0, 1, label='title')
vis_pcs(DCM, 1, 2)
vis_pcs(DCM, 1, 2, label='title')
vis_pcs(DCM, 2, 3)
vis_pcs(DCM, 2, 3, label='title')
vis_pcs(DCM, 3, 4, label='author')
vis_pcs(DCM, 3, 4, label='title')
# Cross-check with scikit-learn's PCA implementation
pca_engine = PCA(n_components=10)
DCM_sk = pd.DataFrame(pca_engine.fit_transform(TFIDF), index=TFIDF.index)
DCM_sk.columns = ['PC{}'.format(i) for i in DCM_sk.columns]
DCM_sk = DCM_sk.join(LIB[['author','title']], on='book_id')
DCM_sk['doc'] = DCM_sk.apply(lambda x: "{}-{}-{}".format(x.author, x.title, x.name[1]), 1)
DCM_sk.head().style.background_gradient()
vis_pcs(DCM_sk, 0, 1)
vis_pcs(DCM_sk, 0, 1)
px.scatter_3d(DCM_sk, 'PC0', 'PC1','PC2', color='title', hover_name='doc', height=1000, width=1200)
# sklearn loadings: eigenvectors scaled by sqrt of explained variance
LOADINGS_sk = pd.DataFrame(pca_engine.components_.T * np.sqrt(pca_engine.explained_variance_))
LOADINGS_sk.columns = ["PC{}".format(i) for i in LOADINGS_sk.columns]
LOADINGS_sk.index = TFIDF.columns
LOADINGS_sk.index.name = 'term_id'
LOADINGS_sk['term_str'] = LOADINGS_sk.apply(lambda x: VOCAB.loc[int(x.name)].term_str, 1)
# pc0_pos = LOADINGS_sk.sort_values('PC0', ascending=False).head(10).term_str.str.cat(sep=' ')
# pc0_neg = LOADINGS_sk.sort_values('PC0', ascending=True).head(10).term_str.str.cat(sep=' ')
# pc1_pos = LOADINGS_sk.sort_values('PC1', ascending=False).head(10).term_str.str.cat(sep=' ')
# pc1_neg = LOADINGS_sk.sort_values('PC1', ascending=True).head(10).term_str.str.cat(sep=' ')
# print('BOOKS PC0+', pc0_pos)
# print('BOOKS PC0-', pc0_neg)
# print('BOOKS PC1+', pc1_pos)
# print('BOOKS PC1-', pc1_neg)
# Persist PCA results
DCM.to_csv('PCA_DCM_books.csv')
COMPS.to_csv('PCA_TCM_books.csv')
LOADINGS.to_csv('LOADINGS.csv')
# CONFIG for topic modeling (LDA)
n_terms = 4000   # vocabulary size for the count vectorizer
n_topics = 30
# raf - do minimum of 20
max_iter = 5     # LDA iterations (low for speed)
OHCO = ['book_id', 'chap_num', 'para_num']
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
%matplotlib inline
Scikit Learn wants an F1-style corpus. We create one from our annotated TOKEN table, keeping only regular nouns.
TOKENS = pd.read_csv('TOKEN2.csv')
TOKENS.head()
# Rebuild paragraph strings from common nouns only (NN / NNS)
PARAS = TOKENS[TOKENS.pos.str.match(r'^NNS?$')]\
    .groupby(OHCO).term_str\
    .apply(lambda x: ' '.join(x))\
    .to_frame()\
    .rename(columns={'term_str':'para_str'})
PARAS.head()
We use Scikit Learn's CountVectorizer to convert our F1 corpus of paragraphs into a document-term vector space of word counts.
tfv = CountVectorizer(max_features=n_terms, stop_words='english')
tf = tfv.fit_transform(PARAS.para_str)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in
# favor of get_feature_names_out(); this targets the older sklearn.
TERMS = tfv.get_feature_names()
We run Scikit Learn's LatentDirichletAllocation algorithm and extract the THETA and PHI tables.
lda = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)
# THETA: document x topic weights
THETA = pd.DataFrame(lda.fit_transform(tf), index=PARAS.index)
THETA.columns.name = 'topic_id'
THETA.sample(20).style.background_gradient()
# PHI: topic x term weights
PHI = pd.DataFrame(lda.components_, columns=TERMS)
PHI.index.name = 'topic_id'
PHI.columns.name = 'term_str'
PHI.T.head().style.background_gradient()
# Top-10 terms per topic: for each topic, sort its term weights and keep
# the 10 strongest term strings. drop(columns=...) replaces the positional
# drop('topic_id', 1), whose 'axis' argument was removed in pandas 2.0.
TOPICS = PHI.stack().to_frame().rename(columns={0:'weight'})\
    .groupby('topic_id')\
    .apply(lambda x:
           x.weight.sort_values(ascending=False)
           .head(10)
           .reset_index()
           .drop(columns='topic_id')
           .term_str)
TOPICS
# Readable label: topic id followed by its top terms
TOPICS['label'] = TOPICS.apply(lambda x: str(x.name) + ' ' + ' '.join(x), 1)
# Overall prevalence: each topic's weight summed over all documents
TOPICS['doc_weight_sum'] = THETA.sum()
TOPICS.sort_values('doc_weight_sum', ascending=True).plot.barh(y='doc_weight_sum', x='label', figsize=(5,10))
LIB = pd.read_csv('LIB3.csv').set_index('book_id')
topic_cols = [t for t in range(n_topics)]
# Mean topic weight per author: join book metadata, group, average
AUTHORS = THETA.join(LIB, on='book_id')\
    .reset_index().set_index(['author']+OHCO)\
    .groupby('author')[topic_cols].mean()\
    .T
AUTHORS.index.name = 'topic_id'
# Attach each topic's top-10 terms as a single string
AUTHORS['topterms'] = TOPICS[[i for i in range(10)]].apply(lambda x: ' '.join(x), 1)
AUTHORS.sort_values('Alcott', ascending=False).style.background_gradient()
AUTHORS.sort_values('Poe', ascending=False).style.background_gradient()
import plotly_express as px
# px.scatter(AUTHORS.reset_index(), 'Alcott', 'Poe', hover_name='topterms', text='topic_id')\
# .update_traces(mode='text')
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
def plot_tree(tree, labels):
    """Draw a left-oriented dendrogram for a precomputed scipy linkage matrix.

    tree : linkage matrix from scipy.cluster.hierarchy.linkage.
    labels : leaf labels, one per original observation.
    """
    # subplots() creates the figure itself; the previous extra plt.figure()
    # call opened a second, empty figure on every invocation.
    fig, axes = plt.subplots(figsize=(5, 10))
    sch.dendrogram(tree, labels=labels, orientation="left")
    plt.tick_params(axis='both', which='major', labelsize=14)
# Cluster topics by their (row-normalized) term distributions
SIMS = pdist(normalize(PHI), metric='euclidean')
TREE = sch.linkage(SIMS, method='ward')
# Label each topic with its per-author weights row id and top terms
labels = ["{}: {}".format(a,b) for a, b in zip(AUTHORS.index, AUTHORS.topterms.tolist())]
plot_tree(TREE, labels)
import pyLDAvis
import pyLDAvis.sklearn
# Interactive in-notebook topic-model browser
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, tf, tfv)
PHI.T
VOCAB
# Attach topic weights to the vocabulary (inner join on term_str)
topics_terms = pd.concat([VOCAB.reset_index().set_index('term_str'), PHI.T], join='inner', axis=1)\
    .reset_index().set_index('term_id')
topics_terms
# Persist the LDA results
THETA.to_csv('LDA_docs_topics.csv')
topics_terms.to_csv('LDA_topics_terms.csv')
# CONFIG for word embeddings
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
BAG = OHCO[:4] # Paragraphs
# BAG = OHCO[:5] # Sentences
# NOTE(review): OHCO[:4] includes sent_num, i.e. sentence-level bags; the
# 'Paragraphs'/'Sentences' labels look off by one — confirm intent.
window = 5  # word2vec context window
from gensim.models import word2vec
from sklearn.manifold import TSNE
import plotly_express as px
%matplotlib inline
We import data from the TOKEN table of the novels corpus, excluding proper nouns.
TOKENS = pd.read_csv('TOKEN2.csv').set_index(OHCO)
# Build the gensim corpus: one list of terms per bag, proper nouns excluded
corpus = TOKENS[~TOKENS.pos.str.match('NNPS?')]\
    .groupby(BAG)\
    .term_str.apply(lambda x: x.tolist())\
    .reset_index()['term_str'].tolist()
# NOTE(review): gensim 4 renamed 'size' to 'vector_size' and removed
# model.wv.vocab (use key_to_index); this code targets gensim 3.x.
model = word2vec.Word2Vec(corpus, size=246, window=window, min_count=200, workers=4)
# Collect each vocabulary word and its embedding vector
coords = pd.DataFrame(index=range(len(model.wv.vocab)))
coords['label'] = [w for w in model.wv.vocab]
coords['vector'] = coords['label'].apply(lambda x: model.wv.get_vector(x))
coords.head()
# Project the embeddings to 2-D with t-SNE for plotting
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
tsne_values = tsne_model.fit_transform(coords['vector'].tolist())
coords['x'] = tsne_values[:,0]
coords['y'] = tsne_values[:,1]
coords.head()
px.scatter(coords, 'x', 'y', text='label', height=1000).update_traces(mode='text')
# Re-key by term string for the join below
coords = coords.rename(columns={'label':'term_str'})
coords = coords[['term_str','vector']]
coords = coords.set_index('term_str')
coords.head()
# Join embeddings onto the vocabulary table (inner: embedded terms only)
EMBEDDINGS = pd.concat([VOCAB.reset_index().set_index('term_str'), coords], join='inner', axis=1)\
    .reset_index().set_index('term_id')
EMBEDDINGS.head()
EMBEDDINGS.to_csv('Term_Embeddings.csv')
$A : B :: C : D? \rightarrow B - A + C = D$
def complete_analogy(A, B, C, n=2):
    """Solve the analogy A : B :: C : ? with the trained word2vec model.

    Returns the top-n (word, similarity) candidates, or None (after
    printing the error) when a query word is missing from the vocabulary.
    """
    try:
        candidates = model.wv.most_similar(positive=[B, C], negative=[A])
    except KeyError as e:
        print('Error:', e)
        return None
    return candidates[:n]
# Analogy demos: A : B :: C : ?
complete_analogy('i', 'we', 'you')
complete_analogy('man', 'woman', 'he')
complete_analogy('man','boy','woman')
# set up
#data_dir = 'data/'
# Input tables produced by the earlier acquisition/annotation notebooks.
novels_csv = 'TOKEN2.csv'
vocab_csv = 'VOCAB2.csv'
lib_csv = 'LIB3.csv'
bow_csv = 'BOW.csv'
# For TOKENS
# OHCO (ordered hierarchy of content objects); slicing a prefix selects
# the bag level at which tokens are grouped.
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
BOOKS = OHCO[:1]
CHAPS = OHCO[:2]
PARAS = OHCO[:3]
SENTS = OHCO[:4]
# Sentiment lexicon file and the emotion columns the analysis uses.
salex_csv = 'salex_nrc.csv'
emo_cols = "anger anticipation disgust fear joy sadness surprise trust polarity".split()
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px
from IPython.core.display import display, HTML
sns.set()
%matplotlib inline
# Load the core tables, each indexed by its natural key.
TOKENS = pd.read_csv(novels_csv).set_index(OHCO)
VOCAB = pd.read_csv(vocab_csv).set_index('term_id')
LIB = pd.read_csv(lib_csv).set_index('book_id')
BOW = pd.read_csv(bow_csv)
# Add token_str to BOW
BOW['term_str'] = BOW.term_id.map(VOCAB.term_str)
BOW = BOW.set_index(['book_id','chap_num','term_id'])
# NOTE(review): sort_index() returns a sorted copy that is discarded here
# (BOW itself stays unsorted) -- likely meant for display or reassignment.
BOW.sort_index()
LIB
# book name and id
# Short handles for the Project Gutenberg ids of the Alcott novels.
old = 2787
eight = 2726
jackjill = 2786
jo = 3499
littleM = 2788
littleW = 514
rose = 2804
under = 3795
# The Poe texts use small local ids rather than Gutenberg ids -- see LIB.
poe1 = 1
poe2 = 2
poe3 = 3
poe4 = 4
poe5 = 5
# Load the sentiment lexicon (NRC emotion lexicon, per the filename),
# one row per term.
salex_csv = 'salex_nrc.csv'
SALEX = pd.read_csv(salex_csv).set_index('term_str')
# Strip the 'nrc_' prefix from the emotion column names.
SALEX.columns = [col.replace('nrc_','') for col in SALEX.columns]
# Net polarity: +1 positive-only, -1 negative-only, 0 neither or both.
SALEX['polarity'] = SALEX.positive - SALEX.negative
SALEX
VOCAB.head()
len(VOCAB)
# Join sentiment scores onto the vocabulary by term string; the inner
# join keeps only terms present in the lexicon, then re-keys by term_id.
V = pd.concat([VOCAB.reset_index().set_index('term_str'), SALEX], join='inner', axis=1)\
.reset_index().set_index('term_id')
V.head()
len(V)
V.to_csv('Vocab_Sentiment.csv')
# Merge sentiment onto the BOW; merge drops the MultiIndex, so stash it
# first and restore it afterwards, then drop terms with no sentiment.
BOWIDX = BOW.index
B = BOW.merge(V, on='term_str', how='left')
B.index = BOWIDX
B = B.dropna()
# didnt have tf, tfidf columns
# NOTE(review): n_x/n_y and term_rank/term_rank2 are merge-suffix
# duplicates from BOW vs V -- confirm which side each should come from.
bcols = ['n_x', 'term_str', 'term_rank', 'n_y', 'num', 'pos_max', 'term_rank2', 'p',
'tfidf_sum', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust', 'polarity']
B = B[bcols]
# Weight each binary emotion flag by the term's TFIDF so frequent but
# bland terms don't dominate the aggregates.
for col in emo_cols:
    B[col] = B[col] * B.tfidf_sum # used to be B.tfidf
B.pos_max.value_counts().sort_values().plot.barh()
EMO_BOOKS = B.groupby(['book_id'])[emo_cols].mean()
EMO_CHAPS = B.groupby(['book_id','chap_num'])[emo_cols].mean()
EMO_BOOKS.index = LIB.book_title
EMO_BOOKS.plot.bar(figsize=(25,10))
EIGHT = EMO_CHAPS.loc[eight].copy()
JACKJILL = EMO_CHAPS.loc[jackjill].copy()
OLD = EMO_CHAPS.loc[old].copy()
JO = EMO_CHAPS.loc[jo].copy()
LITTLEM = EMO_CHAPS.loc[littleM].copy()
LITTLEW = EMO_CHAPS.loc[littleW].copy()
ROSE = EMO_CHAPS.loc[rose].copy()
UNDER = EMO_CHAPS.loc[under].copy()
POE1 = EMO_CHAPS.loc[poe1].copy()
POE2 = EMO_CHAPS.loc[poe2].copy()
POE3 = EMO_CHAPS.loc[poe3].copy()
POE4 = EMO_CHAPS.loc[poe4].copy()
POE5 = EMO_CHAPS.loc[poe5].copy()
EIGHT.mean().sort_values().plot.barh();
JACKJILL.mean().sort_values().plot.barh();
OLD.mean().sort_values().plot.barh();
JO.mean().sort_values().plot.barh();
LITTLEM.mean().sort_values().plot.barh();
LITTLEW.mean().sort_values().plot.barh();
ROSE.mean().sort_values().plot.barh();
UNDER.mean().sort_values().plot.barh();
POE1.mean().sort_values().plot.barh();
POE2.mean().sort_values().plot.barh();
POE3.mean().sort_values().plot.barh();
POE4.mean().sort_values().plot.barh();
POE5.mean().sort_values().plot.barh();
# Sample some words the lexicon tags with 'trust', as a sanity check.
trust_words = pd.Series(SALEX[SALEX['trust'] == 1].index.tolist())
trust_words.sample(10)
# Per-book chapter-level emotion tables (views onto EMO_CHAPS this time,
# not copies -- only used for display below).
EIGHT_chaps = EMO_CHAPS.loc[eight]
JACKJILL_chaps = EMO_CHAPS.loc[jackjill]
OLD_chaps = EMO_CHAPS.loc[old]
JO_chaps = EMO_CHAPS.loc[jo]
LITTLEM_chaps = EMO_CHAPS.loc[littleM]
LITTLEW_chaps = EMO_CHAPS.loc[littleW]
ROSE_chaps = EMO_CHAPS.loc[rose]
UNDER_chaps = EMO_CHAPS.loc[under]
POE1_chaps = EMO_CHAPS.loc[poe1]
POE2_chaps = EMO_CHAPS.loc[poe2]
POE3_chaps = EMO_CHAPS.loc[poe3]
POE4_chaps = EMO_CHAPS.loc[poe4]
POE5_chaps = EMO_CHAPS.loc[poe5]
# Heatmap-style tables (renders only in a notebook); high=.25 compresses
# the top of the color range so moderate values remain distinguishable.
EIGHT_chaps.style.background_gradient(cmap='YlGn', high=.25)
JACKJILL_chaps.style.background_gradient(cmap='YlGn', high=.25)
OLD_chaps.style.background_gradient(cmap='YlGn', high=.25)
JO_chaps.style.background_gradient(cmap='YlGn', high=.25)
LITTLEM_chaps.style.background_gradient(cmap='YlGn', high=.25)
LITTLEW_chaps.style.background_gradient(cmap='YlGn', high=.25)
ROSE_chaps.style.background_gradient(cmap='YlGn', high=.25)
UNDER_chaps.style.background_gradient(cmap='YlGn', high=.25)
POE1_chaps.style.background_gradient(cmap='YlGn', high=.25)
POE2_chaps.style.background_gradient(cmap='YlGn', high=.25)
POE3_chaps.style.background_gradient(cmap='YlGn', high=.25)
POE4_chaps.style.background_gradient(cmap='YlGn', high=.25)
POE5_chaps.style.background_gradient(cmap='YlGn', high=.25)
def _chaps_thin(book_id):
    """Melt EMO_CHAPS into long form for one book.

    Returns a frame with columns (book_id, chap_num, emo, value), ready
    for plotly's long-format line plots.
    """
    thin = EMO_CHAPS.stack().to_frame().reset_index()
    thin = thin.rename(columns={0: 'value', 'level_2': 'emo'})
    return thin.query("book_id == {}".format(book_id))

# One long-format frame per book (same names as before the refactor).
EIGHT_chaps_thin = _chaps_thin(eight)
JACKJILL_chaps_thin = _chaps_thin(jackjill)
OLD_chaps_thin = _chaps_thin(old)
JO_chaps_thin = _chaps_thin(jo)
LITTLEM_chaps_thin = _chaps_thin(littleM)
LITTLEW_chaps_thin = _chaps_thin(littleW)
ROSE_chaps_thin = _chaps_thin(rose)
UNDER_chaps_thin = _chaps_thin(under)
POE1_chaps_thin = _chaps_thin(poe1)
POE2_chaps_thin = _chaps_thin(poe2)
POE3_chaps_thin = _chaps_thin(poe3)
POE4_chaps_thin = _chaps_thin(poe4)
POE5_chaps_thin = _chaps_thin(poe5)
def plot_sentiments(df, emo='polarity'):
    """Plot the selected emotion column(s) of *df* as one wide figure."""
    df[emo].plot(figsize=(25, 5), legend=True, fontsize=14, rot=45)
# plot_sentiments(PERSUASION_chaps, emo_cols)
# Interactive chapter-by-chapter emotion lines, one figure per book.
px.line(EIGHT_chaps_thin, x='chap_num', y='value', color='emo')
# plot_sentiments(MOBYDICK_chaps, emo_cols)
px.line(JACKJILL_chaps_thin, x='chap_num', y='value', color='emo')
px.line(OLD_chaps_thin, x='chap_num', y='value', color='emo')
px.line(JO_chaps_thin, x='chap_num', y='value', color='emo')
px.line(LITTLEM_chaps_thin, x='chap_num', y='value', color='emo')
px.line(LITTLEW_chaps_thin, x='chap_num', y='value', color='emo')
px.line(ROSE_chaps_thin, x='chap_num', y='value', color='emo')
px.line(UNDER_chaps_thin, x='chap_num', y='value', color='emo')
px.line(POE1_chaps_thin, x='chap_num', y='value', color='emo')
px.line(POE2_chaps_thin, x='chap_num', y='value', color='emo')
px.line(POE3_chaps_thin, x='chap_num', y='value', color='emo')
px.line(POE4_chaps_thin, x='chap_num', y='value', color='emo')
px.line(POE5_chaps_thin, x='chap_num', y='value', color='emo')
We need to do this to reconstruct the sentences, which are lost in the BOW representation.
# Merge sentiment onto the full token table; merge drops the MultiIndex,
# so stash and restore it. Tokens absent from the lexicon get neutral 0s.
TOKENSIDX = TOKENS.index
T = TOKENS.merge(V, on='term_str', how='left')
T.index = TOKENSIDX
T = T.fillna(0)
# Per-book token-level frames; .copy() so the html columns added below
# don't trigger SettingWithCopy warnings on slices of T.
EIGHT2 = T.loc[eight].copy()
JACKJILL2 = T.loc[jackjill].copy()
OLD2 = T.loc[old].copy()
JO2 = T.loc[jo].copy()
LITTLEM2 = T.loc[littleM].copy()
LITTLEW2 = T.loc[littleW].copy()
ROSE2 = T.loc[rose].copy()
UNDER2 = T.loc[under].copy()
POE12 = T.loc[poe1].copy()
POE22 = T.loc[poe2].copy()
POE32 = T.loc[poe3].copy()
POE42 = T.loc[poe4].copy()
POE52 = T.loc[poe5].copy()
# Emotion column used for the HTML highlighting below (read globally by
# sample_sentences as well).
emo = 'polarity'

def _sentiment_html(row):
    """Wrap a token in a <span> whose class carries the sign (-1/0/1) of its emo score."""
    return "<span class='sent{}'>{}</span>".format(int(np.sign(row[emo])), row.term_str)

# Tag every token of every book with a polarity-signed span for display.
for _df in (EIGHT2, JACKJILL2, OLD2, JO2, LITTLEM2, LITTLEW2, ROSE2,
            UNDER2, POE12, POE22, POE32, POE42, POE52):
    _df['html'] = _df.apply(_sentiment_html, axis=1)
EIGHT2['html'].sample(10)
EIGHT2
# Mean emotion scores per sentence. SENTS[1:] drops book_id, which is
# already implicit in each single-book frame.
EIGHT2_sents = EIGHT2.groupby(SENTS[1:])[emo_cols].mean()
JACKJILL2_sents = JACKJILL2.groupby(SENTS[1:])[emo_cols].mean()
OLD2_sents = OLD2.groupby(SENTS[1:])[emo_cols].mean()
JO2_sents = JO2.groupby(SENTS[1:])[emo_cols].mean()
LITTLEM2_sents = LITTLEM2.groupby(SENTS[1:])[emo_cols].mean()
LITTLEW2_sents = LITTLEW2.groupby(SENTS[1:])[emo_cols].mean()
ROSE2_sents = ROSE2.groupby(SENTS[1:])[emo_cols].mean()
UNDER2_sents = UNDER2.groupby(SENTS[1:])[emo_cols].mean()
POE12_sents = POE12.groupby(SENTS[1:])[emo_cols].mean()
POE22_sents = POE22.groupby(SENTS[1:])[emo_cols].mean()
POE32_sents = POE32.groupby(SENTS[1:])[emo_cols].mean()
POE42_sents = POE42.groupby(SENTS[1:])[emo_cols].mean()
POE52_sents = POE52.groupby(SENTS[1:])[emo_cols].mean()
# Reconstruct each sentence's plain-text and HTML strings by joining its
# tokens in order.
# BUG FIX: the original assigned EIGHT's html_str onto the token frame
# (EIGHT2) instead of the sentence frame (EIGHT2_sents), unlike every
# other book -- which left EIGHT2_sents without the 'html_str' column
# that sample_sentences() reads. Fixed by handling all books uniformly.
for _toks, _sents in ((EIGHT2, EIGHT2_sents), (JACKJILL2, JACKJILL2_sents),
                      (OLD2, OLD2_sents), (JO2, JO2_sents),
                      (LITTLEM2, LITTLEM2_sents), (LITTLEW2, LITTLEW2_sents),
                      (ROSE2, ROSE2_sents), (UNDER2, UNDER2_sents),
                      (POE12, POE12_sents), (POE22, POE22_sents),
                      (POE32, POE32_sents), (POE42, POE42_sents),
                      (POE52, POE52_sents)):
    _sents['sent_str'] = _toks.groupby(SENTS[1:]).term_str.apply(lambda x: x.str.cat(sep=' '))
    _sents['html_str'] = _toks.groupby(SENTS[1:]).html.apply(lambda x: x.str.cat(sep=' '))
def sample_sentences(df):
    """Render 10 random sentences from *df* as a color-coded HTML table.

    Reads the module-level ``emo`` (name of the sentiment column) and
    expects *df* to carry an 'html_str' column of span-wrapped tokens.
    Rows are tinted green (positive), red (negative) or grey (neutral)
    by the sign of the sentence score; individual tokens are colored by
    the injected CSS classes. Displays via IPython's display/HTML.
    """
    rows = []
    for idx in df.sample(10).index:
        valence = round(df.loc[idx, emo], 4)
        # Tint the row by the sign of the sentence-level score.
        # (Removed the dead local `z = 0` and the constant threshold `t`.)
        if valence > 0:
            color = '#ccffcc'
        elif valence < 0:
            color = '#ffcccc'
        else:
            color = '#f2f2f2'
        rows.append("""<tr style="background-color:{0};padding:.5rem 1rem;font-size:110%;">
<td>{1}</td><td>{3}</td><td width="400" style="text-align:left;">{2}</td>
</tr>""".format(color, valence, df.loc[idx, 'html_str'], idx))
    display(HTML('<style>#sample1 td{font-size:120%;vertical-align:top;} .sent-1{color:red;font-weight:bold;} .sent1{color:green;font-weight:bold;}</style>'))
    display(HTML('<table id="sample1"><tr><th>Sentiment</th><th>ID</th><th width="600">Sentence</th></tr>'+''.join(rows)+'</table>'))
# sample_sentences(EIGHT2_sents)
# sample_sentences(JACKJILL2_sents)
# Score every sentence with VADER and plot rolling-mean sentiment curves.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

def _vader_plot(sents):
    """Score each sentence of *sents* with VADER and plot rolling means.

    Returns (score_cols, merged): the raw per-sentence score frame
    (pos/neg/neu/compound) and *sents* with those columns appended.
    Three figures are drawn: pos+neg, neu, and compound, each smoothed
    with a rolling window of one fifth of the book's sentence count.
    """
    cols = sents.sent_str.apply(analyser.polarity_scores).apply(lambda x: pd.Series(x))
    merged = pd.concat([sents, cols], axis=1)
    w = int(merged.shape[0] / 5)
    merged[['pos', 'neg']].rolling(w).mean().plot(figsize=(25, 5))
    merged[['neu']].rolling(w).mean().plot(figsize=(25, 5))
    merged[['compound']].rolling(w).mean().plot(figsize=(25, 5))
    return cols, merged

# Same module-level names as before the refactor, one pair per book.
EIGHT_vader_cols, EIGHT_vader = _vader_plot(EIGHT2_sents)
JACKJILL_vader_cols, JACKJILL_vader = _vader_plot(JACKJILL2_sents)
OLD_vader_cols, OLD_vader = _vader_plot(OLD2_sents)
JO_vader_cols, JO_vader = _vader_plot(JO2_sents)
LITTLEM_vader_cols, LITTLEM_vader = _vader_plot(LITTLEM2_sents)
LITTLEW_vader_cols, LITTLEW_vader = _vader_plot(LITTLEW2_sents)
ROSE_vader_cols, ROSE_vader = _vader_plot(ROSE2_sents)
UNDER_vader_cols, UNDER_vader = _vader_plot(UNDER2_sents)
POE1_vader_cols, POE1_vader = _vader_plot(POE12_sents)
POE2_vader_cols, POE2_vader = _vader_plot(POE22_sents)
POE3_vader_cols, POE3_vader = _vader_plot(POE32_sents)
POE4_vader_cols, POE4_vader = _vader_plot(POE42_sents)
POE5_vader_cols, POE5_vader = _vader_plot(POE52_sents)